alt text

📚 Importing Libraries 📚

In [1]:
import pandas as pd
import numpy as np


# Visulaiztion Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from yellowbrick.regressor import ResidualsPlot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline

# For Saving Files
import os

# Model Training And Testing libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.metrics import  confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Best Features Selection For Each Category Libraries
from sklearn.feature_selection import SelectKBest, f_regression

# Saving Model
import pickle

# Profiling Libraries
from ydata_profiling import ProfileReport

⏳ Loading the dataset ⏳

In [2]:
movies = pd.read_csv(r"D:\Projects\Python\CodeSoft Internship\Movie Project\IMDb Movies India.csv", encoding='latin1')

🧠 1. Understanding of data 🧠

1.1 | Profile Report of Data

In [5]:
profile = ProfileReport(movies, title="Movies Profiling Report", explorative=True)
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[5]:

In [67]:
# Save the report to the specified path
profile.to_file(r"D:\Projects\CodeSoft Internship\Movie Project\IMDb Movies India.csv")
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
D:\Anaconda\Lib\site-packages\ydata_profiling\profile_report.py:360: UserWarning: Extension .csv not supported. For now we assume .html was intended. To remove this warning, please use .html or .json.
  warnings.warn(
Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

1.2 | First 5 Rows

In [69]:
movies.head()
Out[69]:
Name Year Duration Genre Rating Votes Director Actor 1 Actor 2 Actor 3
0 NaN NaN Drama NaN NaN J.S. Randhawa Manmauji Birbal Rajendra Bhatia
1 #Gadhvi (He thought he was Gandhi) (2019) 109 min Drama 7.0 8 Gaurav Bakshi Rasika Dugal Vivek Ghamande Arvind Jangid
2 #Homecoming (2021) 90 min Drama, Musical NaN NaN Soumyajit Majumdar Sayani Gupta Plabita Borthakur Roy Angana
3 #Yaaram (2019) 110 min Comedy, Romance 4.4 35 Ovais Khan Prateik Ishita Raj Siddhant Kapoor
4 ...And Once Again (2010) 105 min Drama NaN NaN Amol Palekar Rajat Kapoor Rituparna Sengupta Antara Mali

1.3 | Info & Shape of Dataset

In [4]:
movies.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
In [37]:
movies.shape
Out[37]:
(15509, 10)
In [38]:
movies.columns
Out[38]:
Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

1.2 | Saving Features Counts As Percentages

In [6]:
def percent_counts(df, feature):
    total = df[feature].value_counts(dropna=False)
    percent = round(df[feature].value_counts(dropna=False, normalize=True) * 100, 2)
    percent_count = pd.concat([total, percent], keys=['Total', 'Percentage'], axis=1)
    return percent_count
In [8]:
percent_counts(movies, 'Genre')
Out[8]:
Total Percentage
Drama 2780 17.93
NaN 1877 12.10
Action 1289 8.31
Thriller 779 5.02
Romance 708 4.57
... ... ...
Action, Musical, War 1 0.01
Horror, Crime, Thriller 1 0.01
Animation, Comedy 1 0.01
Romance, Action, Crime 1 0.01
Adventure, Fantasy, Sci-Fi 1 0.01

486 rows × 2 columns

In [9]:
percent_counts(movies, 'Rating')
Out[9]:
Total Percentage
NaN 7590 48.94
6.2 269 1.73
6.8 264 1.70
6.5 254 1.64
6.6 239 1.54
... ... ...
9.7 1 0.01
1.4 1 0.01
10.0 1 0.01
9.6 1 0.01
1.1 1 0.01

85 rows × 2 columns

In [ ]:
# Path to save the file
output_path = r"C:\Users\acer\Downloads\CodeSoft Internship\Movie Project\movies_percent_counts.xlsx"

# Create a Pandas Excel writer using XlsxWriter as the engine.
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    for feature in movies.columns:
        df_counts = percent_counts(movies, feature)
        
        # Convert the DataFrame to an XlsxWriter Excel object.
        df_counts.to_excel(writer, sheet_name=feature)

print(f"Report saved to {output_path}")

🧹2. | Data Cleaning 🧹

2.1 | Handling Null Values

In [3]:
# Checking for missing values
missing_values = movies.isnull().sum()

# Print missing values
print("Missing values: \n")
missing_values
Missing values: 

Out[3]:
Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

Removing rows with missing values in columns from 1 to 9¶

In [10]:
#Locating rows with missing values in columns from 1 to 9

nulls = movies[movies.iloc[:, 1:9].isnull().all(axis=1)]
nulls.head(7)
Out[10]:
Name Year Duration Genre Rating Votes Director Actor 1 Actor 2 Actor 3
1836 Bang Bang Reloaded NaN NaN NaN NaN NaN NaN NaN NaN NaN
1920 Battle of bittora NaN NaN NaN NaN NaN NaN NaN NaN NaN
2653 Campus NaN NaN NaN NaN NaN NaN NaN NaN NaN
3403 Dancing Dad NaN NaN NaN NaN NaN NaN NaN NaN NaN
3807 Dial 100 NaN NaN NaN NaN NaN NaN NaN NaN NaN
4271 Durga Rani Singh NaN NaN NaN NaN NaN NaN NaN NaN NaN
8669 Main Hoon Kaun NaN NaN NaN NaN NaN NaN NaN NaN NaN

Removing Null Values¶

In [14]:
# Handling the null values
movies.dropna(subset=['Name', 'Year', 'Genre', 'Duration', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], inplace=True)

2.2 | Extracting Data

In [12]:
# Extracting only the text part from the 'Name' column using a different method
movies['Name'] = movies['Name'].str.extract(r'([A-Za-z\s\'\-]+)')

# Replacing the brackets from 'Year' column and converting to int using different method
movies['Year'] = movies['Year'].apply(lambda x: int(x.strip('()')))

# Converting 'Duration' to numeric and replacing 'min' while keeping only numerical part using different method
movies['Duration'] = movies['Duration'].str.replace(' min', '').astype(float)

# Converting 'Votes' to numeric and replacing commas using different method
movies['Votes'] = movies['Votes'].apply(lambda x: float(x.replace(',', '')))

# Splitting the genre by ',' to keep only unique genres and replacing the null values with mode
movies['Genre'] = movies['Genre'].str.split(', ')
movies = movies.explode('Genre')
movies['Genre'].fillna(movies['Genre'].mode()[0], inplace=True)

2.3 | Handling Duplicates Values

In [72]:
# Check for duplicate rows in the DataFrame
data_dup = movies.duplicated().any()
print(f"Are there duplicate rows in the dataset? {data_dup}")

# Find and print columns with duplicate values and their counts
duplicate_columns = {}
for column in movies.columns:
    duplicates = movies[column].duplicated(keep=False)
    if duplicates.any():
        duplicate_counts = movies[column].value_counts()[movies[column].value_counts() > 1]
        duplicate_columns[column] = duplicate_counts

# Print the columns with their duplicate values and counts
for column, counts in duplicate_columns.items():
    print(f"\nColumn: {column}")
    
Are there duplicate rows in the dataset? True

Column: Name

Column: Year

Column: Duration

Column: Genre

Column: Rating

Column: Votes

Column: Director

Column: Actor 1

Column: Actor 2

Column: Actor 3
In [38]:
# Convert the dictionary to a DataFrame
df_duplicate_info = pd.DataFrame.from_dict(duplicate_columns, orient='index').transpose()

# Save the DataFrame to an Excel file
output_path = r"C:\Users\acer\Downloads\CodeSoft Internship\Duplicate_columns.xlsx"
df_duplicate_info.to_excel(output_path, index=False)

print(f"Duplicate columns information saved to {output_path}")
Duplicate columns information saved to C:\Users\acer\Downloads\CodeSoft Internship\Duplicate_columns.xlsx
In [13]:
movies = movies.drop_duplicates(subset=['Name', 'Year', 'Genre', 'Rating', 'Votes', 'Rating', 'Director', 'Actor 1', 'Actor 2', 'Actor 3' ], keep=False, inplace=False)

2.4 | Unique Values

In [19]:
dict = {}
for i in list(movies.columns):
    dict[i] = movies[i].value_counts().shape[0]

unique = pd.DataFrame(dict,index=["unique count"]).transpose()
unique
Out[19]:
unique count
Name 5282
Year 91
Duration 174
Genre 22
Rating 83
Votes 2027
Director 2431
Actor 1 1960
Actor 2 2321
Actor 3 2556
In [80]:
# Check for typos and use more descriptive variable names
movies_obj = movies.select_dtypes(include="object")

with pd.ExcelWriter(r"C:\Users\acer\Downloads\CodeSoft Internship\Movie Project\movie_column_unique_values.xlsx") as writer:
    movies_obj.to_excel(writer, sheet_name="unique_values")

print("Report saved to Excel file")
Report saved to Excel file

2.5 | Separating Continuous & Categorical Values

In [17]:
continuous_values = []
categorical_values = []

for column in movies.columns:
    if movies[column].dtype == 'int64' or movies[column].dtype == 'float64':
        continuous_values.append(column)
    else:
        categorical_values.append(column)
In [7]:
print(f"Categorical columns: {categorical_values}")
print(f"Continuous columns: {continuous_values}")
Categorical columns: ['Name', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
Continuous columns: ['Year', 'Duration', 'Rating', 'Votes']
In [13]:
movies.head()
Out[13]:
Name Year Duration Genre Rating Votes Director Actor 1 Actor 2 Actor 3
1 Gadhvi 2019 109 Drama 7.0 8 Gaurav Bakshi Rasika Dugal Vivek Ghamande Arvind Jangid
3 Yaaram 2019 110 Comedy 4.4 35 Ovais Khan Prateik Ishita Raj Siddhant Kapoor
3 Yaaram 2019 110 Romance 4.4 35 Ovais Khan Prateik Ishita Raj Siddhant Kapoor
5 Aur Pyaar Ho Gaya 1997 147 Comedy 4.7 827 Rahul Rawail Bobby Deol Aishwarya Rai Bachchan Shammi Kapoor
5 Aur Pyaar Ho Gaya 1997 147 Drama 4.7 827 Rahul Rawail Bobby Deol Aishwarya Rai Bachchan Shammi Kapoor
In [15]:
movies.isnull().sum()
Out[15]:
Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

2.6 | Handling Outliers

In [18]:
def outlier_detect(df, col):
    q1_col = Q1[col]
    iqr_col = IQR[col]
    q3_col = Q3[col]
    return df[((df[col] < (q1_col - 1.5 * iqr_col)) |(df[col] > (q3_col + 1.5 * iqr_col)))]

# ---------------------------------------------------------
def outlier_detect_categorical(df, col):
    if df[col].dtype == 'object':
        # For categorical columns, check for unique or unusual values
        value_counts = df[col].value_counts()
        return value_counts[value_counts < 5]  # Consider values with less than 5 occurrences as outliers
    else:
        # For numerical columns, use IQR method
        q1_col = Q1[col]
        iqr_col = IQR[col]
        q3_col = Q3[col]
        return df[((df[col] < (q1_col - 1.5 * iqr_col)) | (df[col] > (q3_col + 1.5 * iqr_col)))]

# ---------------------------------------------------------
def outlier_detect_normal(df, col):
    m = df[col].mean()
    s = df[col].std()
    return df[((df[col]-m)/s).abs()>3]

# ---------------------------------------------------------
def lower_outlier(df, col):
    q1_col = Q1[col]
    iqr_col = IQR[col]
    q3_col = Q3[col]
    lower = df[(df[col] < (q1_col - 1.5 * iqr_col))]
    return lower

# ---------------------------------------------------------
def upper_outlier(df, col):
    q1_col = Q1[col]
    iqr_col = IQR[col]
    q3_col = Q3[col]
    upper = df[(df[col] > (q3_col + 1.5 * iqr_col))]
    return upper

# ---------------------------------------------------------
def replace_upper(df, col):
    q1_col = Q1[col]
    iqr_col = IQR[col]
    q3_col = Q3[col]
    tmp = 9999999
    upper = q3_col + 1.5 * iqr_col
    df[col] = df[col].where(lambda x: (x < (upper)), tmp)
    df[col] = df[col].replace(tmp, upper)
    print('outlier replace with upper bound - {}' .format(col)) 
    
# ---------------------------------------------------------
def replace_lower(df, col):
    q1_col = Q1[col]
    iqr_col = IQR[col]
    q3_col = Q3[col]
    tmp = 1111111
    lower = q1_col - 1.5 * iqr_col
    df[col] = df[col].where(lambda x: (x > (lower)), tmp)
    df[col] = df[col].replace(tmp, lower)
    print('outlier replace with lower bound - {}' .format(col)) 

# ---------------------------------------------------------


Q1 = movies.quantile(0.25, numeric_only=True)
Q3 = movies.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
for i in range(len(continuous_values)):
    print("IQR => {}: {}".format(continuous_values[i], outlier_detect(movies, continuous_values[i]).shape[0]))
    print("Z_Score => {}: {}".format(continuous_values[i], outlier_detect_normal(movies, continuous_values[i]).shape[0]))
    print("********************************")
IQR => Year: 26
Z_Score => Year: 40
********************************
IQR => Duration: 174
Z_Score => Duration: 101
********************************
IQR => Rating: 60
Z_Score => Rating: 19
********************************
IQR => Votes: 1882
Z_Score => Votes: 136
********************************
In [19]:
outlier = []
for i in range(len(continuous_values)):
    if outlier_detect(movies[continuous_values],continuous_values[i]).shape[0] !=0:
        outlier.append(continuous_values[i])

outlier



for i in range(len(outlier)):
    replace_upper(movies, outlier[i]) 
    
print("\n********************************\n")
for i in range(len(outlier)):
    replace_lower(movies, outlier[i])

Q1 = movies.quantile(0.25, numeric_only=True)
Q3 = movies.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
for i in range(len(continuous_values)):
    print("IQR => {}: {}".format(continuous_values[i], outlier_detect(movies, continuous_values[i]).shape[0]))
    print("Z_Score => {}: {}".format(continuous_values[i], outlier_detect_normal(movies, continuous_values[i]).shape[0]))
    print("********************************")
outlier replace with upper bound - Year
outlier replace with upper bound - Duration
outlier replace with upper bound - Rating
outlier replace with upper bound - Votes

********************************

outlier replace with lower bound - Year
outlier replace with lower bound - Duration
outlier replace with lower bound - Rating
outlier replace with lower bound - Votes
IQR => Year: 0
Z_Score => Year: 40
********************************
IQR => Duration: 0
Z_Score => Duration: 0
********************************
IQR => Rating: 0
Z_Score => Rating: 0
********************************
IQR => Votes: 0
Z_Score => Votes: 0
********************************

🎬 Movie Dataset Exploration¶

📊 Overview of the Dataset¶

  • Total Variables: 10
  • Total Entries: 15,509
  • Memory Usage: 8.2 MiB
  • Average Record Size: 555.0 B

🔢 Unique Value Insights¶

  • Unique Movie Titles: 5,282
  • Distinct Years Represented: 91
  • Varied Durations: 174 unique lengths
  • Diverse Genres: 22
  • Distinct Ratings: 83
  • Different Vote Counts: 2,027
  • Directors: 2,431 unique directors
  • Actors:
    • Lead Actors: 1,960
    • Supporting Actor 1: 2,321
    • Supporting Actor 2: 2,556

🌟 Most Frequent Entries¶

  • Top Movie: Gadhvi
  • Top Genre: Drama (1,617 entries)
  • Top Director: K (115 entries)
  • Top Actor 1: Jeetendra (82 entries)
  • Top Actor 2: Rekha (44 entries)
  • Top Actor 3: Pran (46 entries)

🔍 Detailed Observations¶

  1. Richness in Diversity:

    • The diversity in genres, directors, and actors presents a fantastic opportunity to explore various dimensions of the movie industry.
  2. Year Range:

    • With 91 distinct years, the dataset spans a significant period, allowing for historical trend analysis in the film industry.
  3. Vote and Rating Analysis:

    • The variability in votes and ratings can provide insights into movie popularity and audience reception over time.
  4. Duration Analysis:

    • Examining the range of movie durations can help identify trends in film length preferences across different eras.

✨ Let the cinematic exploration begin! 🌟

🎬 Movie Pre-Processing Insights¶

📊 Outlier Replacement Summary¶

  • Outliers Identified Using:

    • Interquartile Range (IQR)
    • Z-Score
  • Outliers Replaced with Upper and Lower Bounds for:

    • Year
    • Duration
    • Rating
    • Votes

🧼 Data Cleaning and Transformation¶

  1. Null Value Handling:

    • Dropped rows with missing values in critical columns: Name, Year, Genre, Duration, Rating, Votes, Director, Actor 1, Actor 2, Actor 3.
  2. Text Extraction:

    • Extracted pure text from the Name column using regex to remove any non-alphabetic characters.
  3. Year Formatting:

    • Stripped brackets from the Year column and converted the values to integers for consistency.
  4. Duration Conversion:

    • Removed 'min' from the Duration column and converted it to a numeric format, enabling easier analysis of movie lengths.
  5. Votes Conversion:

    • Removed commas from the Votes column and converted it to a numeric format to facilitate statistical operations.
  6. Genre Normalization:

    • Split genres by commas and exploded the list into individual rows, ensuring each genre is treated uniquely.
    • Filled missing genres with the most common genre (mode).

🗑️ Data Deduplication¶

  • Removed duplicate rows based on a combination of key columns: Name, Year, Genre, Rating, Votes, Director, Actor 1, Actor 2, Actor 3.

🧐 Interesting Findings¶

  • The dataset initially had 21.6% missing values, posing a significant challenge for analysis.
  • Genres showed a vibrant mix, with 22 distinct types, highlighting the diverse nature of movie themes.
  • A rich variety of directors and actors (both lead and supporting) indicates a comprehensive dataset suitable for in-depth film industry analysis.
  • Post-cleaning, the dataset has become a reliable source for exploring movie trends, ratings, and other metrics.

📝 Conclusion¶

  • The cleaned dataset, free from duplicates and missing values, is now poised for a wide range of analyses, from genre popularity trends to director and actor career trajectories.
  • With detailed and accurate transformations, this dataset can yield meaningful insights into the dynamics of the movie industry, aiding both academic research and industry strategy formulation.

✨ The dataset is now prepared for detailed analysis, providing a reliable foundation for exploring trends and patterns in the movie industry! 🌟

📊 3. | Exploratory Data Analysis 📊

3.1 | Statistics of Data

In [35]:
movies.describe()
Out[35]:
Year Duration Rating Votes Genre_mean_rating Director_encoded Actor1_encoded Actor2_encoded Actor3_encoded
count 11963.000000 11963.000000 11963.000000 11963.000000 11963.000000 11963.000000 11963.000000 11963.000000 11963.000000
mean 1996.077656 135.868762 5.878851 3333.946418 5.878851 5.878851 5.878851 5.878851 5.878851
std 19.317122 25.027876 1.374141 15314.164524 0.309586 1.098326 1.001887 1.033647 1.046030
min 1931.000000 21.000000 1.100000 5.000000 4.811000 1.600000 1.400000 1.600000 1.700000
25% 1982.000000 120.000000 5.000000 41.000000 5.749888 5.246154 5.292857 5.267857 5.279167
50% 2001.000000 137.000000 6.100000 212.000000 5.818421 6.000000 6.000000 5.945455 5.960000
75% 2012.000000 152.000000 6.900000 1457.000000 6.055878 6.700000 6.575281 6.561818 6.575410
max 2021.000000 321.000000 10.000000 591417.000000 8.000000 10.000000 10.000000 10.000000 10.000000
In [99]:
movies.describe(include='object')
Out[99]:
Name Genre Director Actor 1 Actor 2 Actor 3
count 4990 4990 4990 4990 4990 4990
unique 4990 21 2149 1853 2206 2412
top Gadhvi Drama K Jeetendra Rekha Pran
freq 1 1617 115 82 44 46

3.2 | Top & Bottom 3 Movies By Different Criteria

In [124]:
# Sort the DataFrame by 'Votes' column in descending order
sorted_movies = movies.sort_values(by='Votes', ascending=False)

# Get top 3 movies with the highest number of votes
top_3_highest_votes = sorted_movies.head(3)

# Get bottom 3 movies with the lowest number of votes
bottom_3_lowest_votes = sorted_movies.tail(3)

# Convert top 3 movies with the highest votes to DataFrame
top_3_df = pd.DataFrame(top_3_highest_votes[['Name', 'Votes', 'Year', 'Rating']])

# Convert bottom 3 movies with the lowest votes to DataFrame

bottom_3_df = pd.DataFrame(bottom_3_lowest_votes[['Name', 'Votes', 'Year', 'Rating']])

1. By Votes

In [126]:
print("\nTop 3 Movies By Votes")
top_3_df
Top 3 Movies By Votes
Out[126]:
Name Votes Year Rating
8219 Life of Pi 591417 2012 7.9
75 Idiots 357889 2009 8.4
8233 Lion 220526 2016 8.0

2. Bottom 3 By Votes

In [127]:
print("\nBottom 3 Movies By Votes")
bottom_3_df
Bottom 3 Movies By Votes
Out[127]:
Name Votes Year Rating
11628 Rakshaa Bandhan 5 1977 7.6
3361 Daku Kali Bhawani 5 2000 3.8
4384 Ek Daku Saher Mein 5 1985 3.8
In [129]:
# Sort the DataFrame by 'Votes' column in descending order
sorted_movies = movies.sort_values(by='Rating', ascending=False)

# Get top 3 movies with the highest number of votes
top_3_highest_votes = sorted_movies.head(3)

# Get bottom 3 movies with the lowest number of votes
bottom_3_lowest_votes = sorted_movies.tail(3)

# Convert top 3 movies with the highest votes to DataFrame
top_3_df = pd.DataFrame(top_3_highest_votes[['Name', 'Rating', 'Year', 'Votes']])

# Convert bottom 3 movies with the lowest votes to DataFrame
bottom_3_df = pd.DataFrame(bottom_3_lowest_votes[['Name', 'Rating', 'Year', 'Votes']])

1. Top 3 By Rating

In [131]:
print("\nTop 3 Movies By Rating")
top_3_df
Top 3 Movies By Rating
Out[131]:
Name Rating Year Votes
8339 Love Qubool Hai 10.0 2020 5
5410 Half Songs 9.7 2021 7
14222 The Reluctant Crime 9.4 2020 16

2. Bottom 3 By Rating

In [130]:
print("\nBottom 3 Movies By Rating")
bottom_3_df
Bottom 3 Movies By Rating
Out[130]:
Name Rating Year Votes
15040 Welcome to New York 1.6 2018 774
3618 Desh Drohi 1.4 2008 3899
12171 Sadak 1.1 2020 67785

3.3 | Top Rated Movies In Different Years

In [143]:
# Group movies by year
movies_by_year = movies.groupby('Year')

# Apply a function to find the movie with the highest rating within each year group
def get_top_rated_movie(group):
  return group.sort_values(by='Rating', ascending=False).head(1)

# Apply the function to each year group and store the result in a new DataFrame
top_rated_movies_per_year = movies_by_year.apply(get_top_rated_movie)

# Reset the index to remove the multi-level indexing
top_rated_movies_per_year = top_rated_movies_per_year.reset_index(drop=True)
top_rated_movies_per_year
Out[143]:
Name Year Duration Genre Rating Votes Director Actor 1 Actor 2 Actor 3
0 The Light of the World 1931 124 Drama 6.2 112 Ardeshir Irani Master Vithal Zubeida Jillo
1 Indrasabha 1932 211 Musical 6.0 12 J Nissar Jehanara Kajjan Abdul Rehman Kabuli
2 Puran Bhagat 1933 159 Unknown 6.5 10 Debaki Bose Choudhury Mohammed Rafiq M Anwari
3 Chandidas 1934 128 Unknown 6.6 5 Nitin Bose K Umasashi Pahadi Sanyal
4 Inquilab 1935 144 Drama 7.4 38 Debaki Bose Durga Khote Prithviraj Kapoor Syed Mohammed
... ... ... ... ... ... ... ... ... ... ...
86 Rediscovering India 2017 124 Documentary 9.0 62 Meenal Dixit Benny John Benny John Meenal Dixit
87 Ashok Vatika 2018 97 Drama 9.3 7 Rahul Mallick Kunj Anand Sanjay Bishnoi Paras Zutshi
88 Gho Gho Rani 2019 105 History 9.4 47 Munni Pankaj Nishi Neha Mishra Pankaj Kamal Akash Kumar
89 Love Qubool Hai 2020 94 Drama 10.0 5 Saif Ali Sayeed Ahaan Jha Mahesh Narayan Rajasree Rajakumari
90 Half Songs 2021 79 Music 9.7 7 Sriram Raja Raj Banerjee Emon Chatterjee Purshottam Mulani

91 rows × 10 columns

In [144]:
# Specify the desired file path (replace with your actual path)
file_path = "C:\\Users\\acer\\Downloads\\CodeSoft Internship\\Movie Project\\top_rated_movies_per_year.xlsx"

# Write the DataFrame to the Excel file (without index)
top_rated_movies_per_year.to_excel(file_path, index=False)

print("Top rated movies per year saved to", file_path)
Top rated movies per year saved to C:\Users\acer\Downloads\CodeSoft Internship\Movie Project\top_rated_movies_per_year.xlsx

3.4 | Filter Movies By Different Directors

In [164]:
director_counts = movies['Director'].value_counts()

# Sort directors by movie count (descending order) and get the top 3
top_3_directors = director_counts.nlargest(3)

print("Top 3 directors with the most movies :")
top_3_directors
Top 3 directors with the most movies :
Out[164]:
K    115
S     78
R     40
Name: Director, dtype: int64

Top / Bottom Rated & Voted Movies By Different Directors

In [162]:
# Find the director with the highest rated movie
top_rated_movie = movies.sort_values(by='Rating', ascending=False).head(1)
top_rated_director = top_rated_movie['Director'].values[0]  # Extract director name

# Find the director with the lowest rated movie
bottom_rated_movie = movies.sort_values(by='Rating', ascending=True).head(1)
bottom_rated_director = bottom_rated_movie['Director'].values[0]  # Extract director name

# Find the director with the most voted movie
most_voted_movie = movies.sort_values(by='Votes', ascending=False).head(1)
most_voted_director = most_voted_movie['Director'].values[0]  # Extract director name

# Find the director with the least voted movie
least_voted_movie = movies.sort_values(by='Votes', ascending=True).head(1)
least_voted_director = least_voted_movie['Director'].values[0]  # Extract director name

print("\nDirector with the highest rated movie:", top_rated_director, "\n")
print(top_rated_movie[['Name', 'Year', 'Rating', 'Genre']].to_string(index=False))  # Display movie details
print('\n', '='*100, '\n')

print("\nDirector with the lowest rated movie:", bottom_rated_director , "\n")
print(bottom_rated_movie[['Name', 'Year', 'Rating', 'Genre']].to_string(index=False))  # Display movie details
print('\n', '='*100, '\n')

print("\nDirector with the most voted movie:", most_voted_director, "\n")
print(most_voted_movie[['Name', 'Year', 'Votes', 'Rating', 'Genre']].to_string(index=False))  # Display movie details
print('\n', '='*100, '\n')

print("\nDirector with the least voted movie:", least_voted_director, "\n")
print(least_voted_movie[['Name', 'Year', 'Votes', 'Rating', 'Genre']].to_string(index=False))  # Display movie details
Director with the highest rated movie: Saif Ali Sayeed 

           Name  Year  Rating Genre
Love Qubool Hai  2020    10.0 Drama

 ==================================================================================================== 


Director with the lowest rated movie: Mahesh Bhatt 

  Name  Year  Rating  Genre
Sadak   2020     1.1 Action

 ==================================================================================================== 


Director with the most voted movie: Ang Lee 

      Name  Year  Votes  Rating     Genre
Life of Pi  2012 591417     7.9 Adventure

 ==================================================================================================== 


Director with the least voted movie: Saif Ali Sayeed 

           Name  Year  Votes  Rating Genre
Love Qubool Hai  2020      5    10.0 Drama

Top/Bottom Rated & Voted Movies By Different Genre

In [166]:
# Count movies in each genre
genre_counts = movies['Genre'].value_counts()

# Get the top 3 genres with the most movies (using nlargest)
top_3_genres = genre_counts.nlargest(3)

print("Top 3 genres with the most movies:")
print(top_3_genres)
Top 3 genres with the most movies:
Drama     1617
Action    1389
Comedy     921
Name: Genre, dtype: int64
In [174]:
# Count movies in each genre
genre_counts = movies['Genre'].value_counts()

# Get the top 3 genres with the most movies (using nlargest)
top_3_genres = genre_counts.nlargest(3)

# Loop through top 3 genres
for i, genre in enumerate(top_3_genres.index):
  # Genre numbering and highlighting
  print(f"\n** Genre {i+1}: {genre.upper()} **")  # Uppercase for highlighting

  # Filter movies for the current genre
  genre_movies = movies[movies['Genre'] == genre]

  # Find the top rated movie within the genre
  top_rated_movie = genre_movies.sort_values(by='Rating', ascending=False).head(1)

  # Find the bottom rated movie within the genre
  bottom_rated_movie = genre_movies.sort_values(by='Rating', ascending=True).head(1)
  
  # Print results for the current genre
  print("\n  * Top Rated Movie:")
  print(top_rated_movie[['Name', 'Year', 'Rating']].to_string(index=False))  # Display movie details
  print('\n', '_'*100, '\n')
  print("\n  * Bottom Rated Movie:")
  print(bottom_rated_movie[['Name', 'Year', 'Rating']].to_string(index=False))  # Display movie details
  print('\n', '='*100, '\n')
** Genre 1: DRAMA **

  * Top Rated Movie:
           Name  Year  Rating
Love Qubool Hai  2020    10.0

 ____________________________________________________________________________________________________ 


  * Bottom Rated Movie:
                   Name  Year  Rating
Mumbai Can Dance Saalaa  2015     1.6

 ==================================================================================================== 


** Genre 2: ACTION **

  * Top Rated Movie:
       Name  Year  Rating
I'm in Love  2007     9.2

 ____________________________________________________________________________________________________ 


  * Bottom Rated Movie:
  Name  Year  Rating
Sadak   2020     1.1

 ==================================================================================================== 


** Genre 3: COMEDY **

  * Top Rated Movie:
        Name  Year  Rating
Love Sorries  2021     9.3

 ____________________________________________________________________________________________________ 


  * Bottom Rated Movie:
               Name  Year  Rating
Welcome to New York  2018     1.6

 ==================================================================================================== 

🎬 Movie Dataset Insights¶

🌟 Top Movies¶

By Votes¶

  • Life of Pi (2012) - 591,417 Votes, Rating: 7.9

By Rating¶

  • Love Qubool Hai (2020) - Rating: 10.0, 5 Votes

🎭 Director Insights¶

Top Director with Most Movies¶

  • Director K - 115 Movies

Director with Highest Rated Movie¶

  • Saif Ali Sayeed for Love Qubool Hai (2020) - Rating: 10.0

Director with Most Voted Movie¶

  • Ang Lee for Life of Pi (2012) - 591,417 Votes, Rating: 7.9

🎭📽️ Genre Insights¶

Top Rated Drama¶

  • Love Qubool Hai (2020) - Rating: 10.0

Top Rated Action¶

  • I'm in Love (2007) - Rating: 9.2

Top Rated Comedy¶

  • Love Sorries (2021) - Rating: 9.3

🎭👨‍👨‍👦‍👦 Actor Insights¶

Top Actor in Drama¶

  • Saif Ali Sayeed - Lead in Love Qubool Hai (2020)

Top Actor in Action¶

  • Mahesh Bhatt - Lead in Sadak (2020)

Top Actor in Comedy¶

  • Raj Banerjee - Lead in Welcome to New York (2018)

📊 Insights:

1️⃣ The dataset spans from 1931 onwards, with the shortest movie having a duration of just 45 minutes.

2️⃣ Drama emerges as the most prevalent genre, with actor Mithun leading in terms of appearances.

3️⃣ Analysis showcases both the highest-rated and the lowest-rated movies, along with their corresponding votes.

4️⃣ Directors' contributions vary significantly, with some directing a large number of movies while others direct fewer.

3.5 | Univariate Analysis

In [33]:
colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'gray', 'yellow', 'cyan', 'black', 'olive' , 'lime', 'magenta', 'lightblue', 'lightcyan', 'lightgray', 'lightgreen', 'lightpurple', 'lightyellow', 'white']

3.5.1 | Top 5 Movies by Different Features

In [ ]:
def TopFivePlot(df, column, palette=None):
    if column not in df.columns:
        print(f"Error: '{column}' column not found in the DataFrame.")
        return
    
    plt.figure(figsize=(12, 6))
    sns.countplot(data=df, x=column, order=df[column].value_counts().nlargest(5).index, edgecolor="white", palette=palette)
    plt.xticks(rotation=0)
    plt.title(f"Top Five {column}")
    plt.xlabel(column)
    plt.ylabel("Count")
    plt.show()
In [167]:
TopFivePlot(movies, 'Actor 1', colors)
In [160]:
TopFivePlot(movies, 'Genre')
In [170]:
TopFivePlot(movies, 'Director', colors[7:])
In [184]:
TopFivePlot(movies, 'Rating', colors[4:])

3.5.2 | Movies By Distribution of Differnet Features

Movies By Years Distribution

In [22]:
# Define color sequence
colors = px.colors.qualitative.Plotly

# Create a histogram with probability density
fig_year = px.histogram(movies, x='Year', histnorm='probability density', nbins=30, color_discrete_sequence=colors)

# Update layout for aesthetics
fig_year.update_layout(
    title=dict(
        text='Distribution of Year',
        x=0.5,  # Center title horizontally
        pad=dict(t=20),  
        font=dict(size=20)  # Set title font size
    ),
    xaxis_title='Year',
    yaxis_title='Probability Density',
    xaxis=dict(showgrid=False),  # Hide x-axis grid lines
    yaxis=dict(showgrid=False),  # Hide y-axis grid lines
    bargap=0.02,  # Reduce bar spacing slightly
    plot_bgcolor='white'  # Set background color to white
)

# Display the histogram
fig_year.show()

Movies By Duration Distribution

In [23]:
fig_duration = px.histogram(movies, x = 'Duration', histnorm='probability density', nbins = 40, color_discrete_sequence = colors[2:3])
fig_duration.update_traces(selector=dict(type='histogram'))
fig_duration.update_layout(title='Distribution of Duration', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Duration', yaxis_title='Probability Density', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), bargap=0.02, plot_bgcolor = 'white')
fig_duration.show()

Movies By Rating Distribution

In [24]:
# Create a histogram plot for the 'Rating' column
fig_rating = px.histogram(
    movies,
    x='Rating',
    histnorm='probability density',
    nbins=70,
    color_discrete_sequence=colors[4:5]
)

# Update the traces to be histograms
fig_rating.update_traces(type='histogram')

# Update layout settings
fig_rating.update_layout(
    title='Distribution of Rating',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis_title='Rating',
    yaxis_title='Probability Density',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    bargap=0.02,
    plot_bgcolor='white'
)

# Display the histogram
fig_rating.show()

Movies By Votes Distribution

In [25]:
#Create a violin plot for the 'Votes' column
fig_votes = px.violin(
    movies,
    y='Votes', 
    color_discrete_sequence=colors[3:4]
)

# Update layout settings
fig_votes.update_layout(
    title='Distribution of Votes',
    title_x=0.5,
    xaxis_title='Probability Density',
    yaxis_title='Votes',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    plot_bgcolor='white'
)

# Display the violin plot
fig_votes.show()

📊 Insights:

  • The distribution of Year is skewed towards left with a high concentration of movies directed in between 2015-2019

  • The duration of movies has a guassian distribution with a very few outliers

  • The distribution of Rating is also having a guassian distribution with a high concentration of 6.6 and 6.7

  • The number of votes has a plenty of outliers

3.6 | BI-Variate Analysis

3.6.1 | Top 10 Years By rating

In [26]:
# Group by year and calculate the mean rating
year_avg_rating = movies.groupby('Year')['Rating'].mean()

# Get top 10 years by average rating
top_10_years = year_avg_rating.nlargest(10)

# Create a bar plot for top 10 years by average rating
fig = px.bar(
    top_10_years.reset_index(),  # Reset index to convert series to DataFrame
    x='Year',
    y='Rating',
    title='Top 10 Years by Average Rating',
    color='Rating',  # Color by rating
    color_continuous_scale='darkmint'  # Specify color scale
)

# Update layout settings
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Average Rating',
    plot_bgcolor='white'
)

# Display the bar plot
fig.show()

3.6.2 | Trends In Rating By Diff Years

In [27]:
# Group data by Year and calculate the average rating
average_rating_by_year = movies.groupby('Year')['Rating'].mean().reset_index()

# Create the line plot with Plotly Express
fig = px.line(average_rating_by_year, x='Year', y='Rating', title='Trends in ratings across years', labels={'Rating': 'Average Rating'}, 
              template='plotly_white', color_discrete_sequence= colors[9:10])
fig.show()

3.6.3 | Trends In Votes By Years

In [28]:
colors = px.colors.qualitative.Plotly  # This is a built-in color sequence with enough elements

# Verify the length of the colors list
if len(colors) < 23:
    print(f"Colors list has only {len(colors)} elements, using default color sequence.")
    color_sequence = colors
else:
    color_sequence = colors[21:23]

# Group the data and calculate the average votes by year
average_votes_by_year = movies.groupby('Year')['Votes'].mean().reset_index()

# Create the line plot with Plotly Express
fig = px.line(average_votes_by_year, x='Year', y='Votes', title='Trends in votes across years',
              labels={'Votes': 'Average Votes'}, template='plotly_white', color_discrete_sequence=color_sequence)

fig.show()
Colors list has only 10 elements, using default color sequence.

3.6.4 | Top 3 Genre of Years By AVG-Rating

In [29]:
# Group data by Year and Genre and calculate the average rating
average_rating_by_year_genre = movies.groupby(['Year', 'Genre'])['Rating'].mean().reset_index()

# Get the top 3 genres
top_3_genres = movies['Genre'].value_counts().head(3).index

# Filter the data to include only the top 3 genres
average_rating_top_3_genres = average_rating_by_year_genre[average_rating_by_year_genre['Genre'].isin(top_3_genres)]

# Create the line plot with Plotly Express
fig = px.line(average_rating_top_3_genres, x='Year', y='Rating', color='Genre',
              title='Average Rating by Year for Top 3 Genres', 
              labels={'Year': 'Year', 'Rating': 'Average Rating'}, 
              color_discrete_sequence=colors[7:11],
              template='plotly_white')

# Show the plot
fig.show()

3.6.5 | Top 21 Directors By Movies CountOver The Years

In [30]:
# Top Directors Analysis
director_df = movies[['Director', 'Year']].dropna()
director_df['Movie_Count'] = 1

# Get the top 20 directors by movie count
top_21_directors = director_df['Director'].value_counts().head(21).index.tolist()
top_21_director_df = director_df[director_df['Director'].isin(top_21_directors)]

# Calculate count of movies for each director
top_21_director_count = top_21_director_df.groupby('Director')['Movie_Count'].sum().reset_index()

# Plot the top 20 directors by movie count over the years
fig = px.bar(top_21_director_count, x='Director', y='Movie_Count', color='Director',
             title='Top 21 Directors by Number of Movies Made Over the Years')

# Update layout
fig.update_layout(
    xaxis=dict(title='Director', tickfont_size=14, showgrid=False),  # Remove grid lines
    yaxis=dict(title='Number of Movies', tickfont_size=14, showgrid=False),  # Remove grid lines
    legend_title='Director',
    height=800,  # Increase plot height
    plot_bgcolor='white',  # Change background color
)

fig.show()

3.6.6 | Number of Movies Released By Year

In [31]:
# Plot the number of movies released by year
year_count = movies['Year'].value_counts().reset_index()
year_count.columns = ['Year', 'Count']
fig = px.bar(year_count, x='Year', y='Count', text='Count', title='Number of Movies Released by Year', template='plotly_white', color_discrete_sequence=colors[5:9])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(
    xaxis=dict(title='Year of Movie Release', titlefont_size=17, tickfont_size=17, showgrid=False),
    yaxis=dict(title='Count of Movies Released', titlefont_size=17, tickfont_size=17, showgrid=False)
)
fig.show()

3.7 | Imapct of Features On Rating

3.7.1 | Impact of Duration on Rating

In [32]:
# Group data by Duration and calculate the average rating
average_rating_by_duration = movies.groupby('Duration')['Rating'].mean().reset_index()
fig = px.line(average_rating_by_duration, x='Duration', y='Rating', 
              title='Impact of Movie Duration on Rating', 
              labels={'Duration': 'Duration (minutes)', 'Rating': 'Average Rating'}, 
              template='plotly_white', color_discrete_sequence= colors[1:3])

# Customize the layout
fig.update_layout(title_x=0.5, 
                  xaxis=dict(title='Duration of Movie in Minutes', showgrid=False), 
                  yaxis=dict(title='Average Rating', showgrid=False))

# Show the plot
fig.show()
In [33]:
fig_dur_rating = px.scatter(movies, x='Duration', y='Rating', trendline='ols', color='Rating', color_continuous_scale='RdBu')

# Update layout
fig_dur_rating.update_layout(
    title='Impact of Movie Duration on Rating',
    title_x=0.5, 
    title_pad=dict(t=20), 
    title_font=dict(size=20),
    xaxis_title='Duration of Movie (Minutes)',
    yaxis_title='Rating',
    xaxis=dict(showgrid=False), 
    yaxis=dict(showgrid=False), 
    plot_bgcolor='white',
    legend=dict(
        title='Rating',
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
        x=1.02, y=0.5,
        xanchor='left',
        font=dict(size=12)
    )
)

# Show the plot
fig_dur_rating.show()

3.7.2 | Imapct of Duration on Votes

In [34]:
# Group data by Duration and calculate the average votes
average_votes_by_duration = movies.groupby('Duration')['Votes'].mean().reset_index()

# Create the line plot with Plotly Express
fig = px.line(average_votes_by_duration, x='Duration', y='Votes', 
              title='Impact of Movie Duration on Votes', 
              labels={'Duration': 'Duration (minutes)', 'Votes': 'Average Votes'}, 
              template='plotly_white', color_discrete_sequence= colors[5:6])

# Customize the layout
fig.update_layout(title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), 
                  xaxis=dict(title='Duration of Movie in Minutes', showgrid=False), 
                  yaxis=dict(title='Average Votes', showgrid=False))

# Show the plot
fig.show()
In [39]:
fig_dur_votes = px.scatter(movies, x = 'Duration', y = 'Votes', trendline='ols', color = "Votes", color_continuous_scale = "darkmint")
fig_dur_votes.update_layout(title='Does length of movie have any impact on Votes?', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Duration of Movie in Minutes', yaxis_title='Votes of a movie', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), plot_bgcolor = 'white')
fig_dur_votes.show()

3.7.3 | Imapct of Genre on Rating

In [35]:
# Group data by Genre and calculate the average rating
average_rating_by_genre = movies.groupby('Genre')['Rating'].mean().reset_index()

# Create the bar plot with Plotly Express
fig = px.bar(average_rating_by_genre, x='Genre', y='Rating', 
             title='Impact of Movie Genre on Rating', 
             labels={'Genre': 'Genre of Movie', 'Rating': 'Average Rating'}, 
             template='plotly_white', color_discrete_sequence= colors[6:])

# Customize the layout
fig.update_layout(title_x=0.5,
                  xaxis=dict(title='Genre of Movie', showgrid=False), 
                  yaxis=dict(title='Average Rating', showgrid=False))

# Show the plot
fig.show()
In [36]:
import plotly.express as px

# Group the data by 'Year' and 'Genre' and count the occurrences of each genre in each year
genre_count_by_year = movies.groupby(['Year', 'Genre']).size().reset_index(name='Count')

# Create line plot
fig_genre_trends = px.line(genre_count_by_year, x='Year', y='Count', color='Genre', title='Genre Trends Across Years',
                           line_shape='spline', render_mode='svg', color_discrete_sequence=['black', 'mintcream', 'red', 'yellow', 'green', 'purple'])

# Update layout
fig_genre_trends.update_layout(
    xaxis_title='Year',
    yaxis_title='Count',
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    legend_title='Genre',
    legend=dict(
        x=1.02,
        y=0.5,
        xanchor='left',
        font=dict(size=10)
    ),
    font=dict(
        family="Arial, sans-serif",
        size=12,
        color="black"
    ),
    title_font=dict(
        size=20,
        family="Arial, sans-serif",
        color="black"
    ),
    title_x=0.5,
    title_y=0.92,
    title_font_size=24
)

# Show the plot
fig_genre_trends.show()

3.7.5 | 3D Plot of Duration, Rating & Votes

In [37]:
# Duration, Rating, and Votes Analysis
dur_rat = movies[['Duration', 'Rating', 'Votes']].dropna()

# Inspecting the data
print(dur_rat['Duration'].unique())

# Clean and convert 'Duration' column
dur_rat['Duration'] = dur_rat['Duration'].astype(str).str.replace(' min', '')
dur_rat['Duration'] = dur_rat['Duration'].str.replace(r'[^\d.]', '', regex=True).astype(float).astype(int)

# Clean and convert 'Votes' column
dur_rat['Votes'] = dur_rat['Votes'].astype(str).str.replace(',', '').astype(float)

# 3D Scatter Plot
fig = px.scatter_3d(dur_rat, x='Duration', y='Rating', z='Votes', color='Rating', title='3D Plot of Duration, Rating, and Votes', color_continuous_scale='RdBu')
fig.show()
[109. 110. 147. 142.  82. 116.  96. 120. 161. 166. 102.  87. 132. 105.
 146. 168. 158.  94. 124. 157. 107. 113.  80. 122. 149. 130. 121. 126.
 188. 115. 103. 114. 170. 100.  99. 140. 128.  93. 125. 145.  75. 111.
 134. 104.  92. 136. 137. 127. 119.  90. 150. 151.  95. 112. 143. 177.
 117. 148. 123. 144.  72. 154. 175. 153.  78. 138. 139. 133. 180. 135.
 164. 162. 171. 160. 152. 163. 165. 141. 129. 156. 200. 172.  88. 155.
 167. 106. 193. 108. 195. 174.  81. 178. 184.  97. 176. 169. 131.  77.
  91.  86.  84. 173. 118. 181. 101.  79. 183. 159.  83.  89. 186.  74.
  85.  98.  76. 185. 187.  73. 191. 199. 179. 190. 189. 197. 182. 192.]

📊 Insights¶


🎬 The Golden Era:¶

  • The peak year for highest average ratings is 1944.

🎢 Ups and Downs:¶

  • A noticeable decline in ratings is observed from 1984 to 1993, coinciding with the fare.

🚀 Rising Stars:¶

  • Conversely, ratings have been consistently increasing since 2013.

📈 Audience Favorites:¶

  • In 2010, movies received the highest average votes.

🎭 Genre Charisma:¶

  • Drama genre maintains the highest average ratings since its inception.

🎭🤣 Lights, Camera, Laughter:¶

  • The Comedy genre emerged in 1953, while Action genre started in 1964.

⏳ Short and Sweet:¶

  • Short-duration movies tend to receive higher ratings and higher votes, suggesting a preference for concise storytelling.

⚙️ 4. Feature Engineering ⚙️

4.1 | Dropping Name

In [12]:
# Dropping non essential columns
movies.drop('Name', axis = 1, inplace = True)
In [13]:
movies.head()
Out[13]:
Year Duration Genre Rating Votes Director Actor 1 Actor 2 Actor 3
1 2019 109.0 Drama 7.0 8.0 Gaurav Bakshi Rasika Dugal Vivek Ghamande Arvind Jangid
3 2019 110.0 Comedy 4.4 35.0 Ovais Khan Prateik Ishita Raj Siddhant Kapoor
3 2019 110.0 Romance 4.4 35.0 Ovais Khan Prateik Ishita Raj Siddhant Kapoor
5 1997 147.0 Comedy 4.7 827.0 Rahul Rawail Bobby Deol Aishwarya Rai Bachchan Shammi Kapoor
5 1997 147.0 Drama 4.7 827.0 Rahul Rawail Bobby Deol Aishwarya Rai Bachchan Shammi Kapoor

4.2 | Converting Categorical Features

In [13]:
def add_mean_ratings(movies):

  genre_means = movies.groupby('Genre')['Rating'].mean()
  director_means = movies.groupby('Director')['Rating'].mean()
  actor_means = {col: movies.groupby(col)['Rating'].mean() for col in ['Actor 1', 'Actor 2', 'Actor 3']}

  movies['Genre_mean_rating'] = movies['Genre'].map(genre_means)
  movies['Director_encoded'] = movies['Director'].map(director_means)

  for actor, mean in actor_means.items():
    movies[f"{actor}_encoded"] = movies[actor].map(mean)

  return movies

df = movies.copy(deep=True)
movies_encoded = add_mean_ratings(df)
movies_encoded.head()
Out[13]:
Year Duration Genre Rating Votes Director Actor 1 Actor 2 Actor 3 Genre_mean_rating Director_encoded Actor 1_encoded Actor 2_encoded Actor 3_encoded
1 2019 109.0 Drama 7.0 8.0 Gaurav Bakshi Rasika Dugal Vivek Ghamande Arvind Jangid 6.056998 7.000000 6.850000 7.00 7.00
3 2019 110.0 Comedy 4.4 35.0 Ovais Khan Prateik Ishita Raj Siddhant Kapoor 5.752085 4.400000 5.250000 4.40 4.46
3 2019 110.0 Romance 4.4 35.0 Ovais Khan Prateik Ishita Raj Siddhant Kapoor 5.812772 4.400000 5.250000 4.40 4.46
5 1997 147.0 Comedy 4.7 827.0 Rahul Rawail Bobby Deol Aishwarya Rai Bachchan Shammi Kapoor 5.752085 5.335135 4.793617 5.73 5.93
5 1997 147.0 Drama 4.7 827.0 Rahul Rawail Bobby Deol Aishwarya Rai Bachchan Shammi Kapoor 6.056998 5.335135 4.793617 5.73 5.93

Feature Engineering Insights:¶

🛠️ Transformation:¶

  • Dropped Column:
    • The non-essential 'Name' column was dropped from the dataset.

📊 New Features Added:¶

  • Mean Ratings:
    • Mean ratings for each genre, director, and actor were calculated and included in the dataset.

This streamlined feature engineering process ensures a concise yet effective dataset for model training.

5.| Data Separating and Splitting 🪓

In [14]:
features = movies_encoded.drop(columns=['Rating', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Genre']).values
target = movies_encoded['Rating'].values


x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
In [15]:
print("Training set features shape:", x_train.shape)
print("Testing set features shape:", x_test.shape)
print("Training set target shape:", y_train.shape)
print("Testing set target shape:", y_test.shape)
Training set features shape: (9570, 8)
Testing set features shape: (2393, 8)
Training set target shape: (9570,)
Testing set target shape: (2393,)
In [ ]:
x_train_path = r'C:\Users\acer\Downloads\x_train.csv'
if os.path.exists(x_train_path):
    os.remove(x_train_path)
x_train_df = pd.DataFrame(data=x_train)
x_train_df.to_csv(x_train_path, index=False)
print("Training features saved successfully!")

# Save the testing features
x_test_path = r'C:\Users\acer\Downloads\x_test.csv'
if os.path.exists(x_test_path):
    os.remove(x_test_path)
x_test_df = pd.DataFrame(data=x_test)
x_test_df.to_csv(x_test_path, index=False)
print("Testing features saved successfully!")


# Save the training features
y_train_path = r'C:\Users\acer\Downloads\y_train.csv'
if os.path.exists(y_train_path):
    os.remove(y_train_path)
y_train_df = pd.DataFrame(data=y_train)
y_train_df.to_csv(y_train_path, index=False)
print("Training features saved successfully!")

# Save the testing features
y_test_path = r'C:\Users\acer\Downloads\y_test.csv'
if os.path.exists(y_test_path):
    os.remove(y_test_path)
y_test_df = pd.DataFrame(data=y_test)
y_test_df.to_csv(y_test_path, index=False)
print("Testing features saved successfully!")

6.| Feature Extraction ( Additional )

In [106]:
Kbest_reg = SelectKBest(score_func=f_regression, k=7)
Kbest_reg.fit(x_train, y_train)

# what are scores for the features
for i in range(len(Kbest_reg.scores_)):
    print(f'Feature {i} : {round(Kbest_reg.scores_[i],3)}')

print()

# plot the scores

plt.bar([i for i in range(len(Kbest_reg.scores_))], Kbest_reg.scores_)
plt.show()
Feature 0 : 545.877
Feature 1 : 16.196
Feature 2 : 229.87
Feature 3 : 517.18
Feature 4 : 17218.75
Feature 5 : 10987.842
Feature 6 : 12371.432
Feature 7 : 13242.967

In [20]:
x_train_reg = Kbest_reg.transform(x_train)
print("X_train.shape: {}".format(x_train.shape))
print()
print("X_train_selected.shape: {}".format(x_train_reg.shape))
print()
# transform test data
x_test_reg = Kbest_reg.transform(x_test)
X_train.shape: (9570, 8)

X_train_selected.shape: (9570, 7)

🌟 Top Features:¶

  • Feature Importance:
    • The top features contributing to model performance were identified:
      1. Genre_mean_rating
      2. Director_encoded
      3. Actor2_encoded
      4. Actor1_encoded
      5. Actor3_encoded

🎯7. | Model Building 🎯

Functions Used For Training, Evaluation And Prediction

In [20]:
def model_regressor(regressor, x_train, y_train, x_test, y_test):
    regressor.fit(x_train, y_train)
    y_pred = regressor.predict(x_test)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2_train = regressor.score(x_train, y_train)
    r2_test = r2_score(y_test, y_pred)
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R-squared Score(R2 ) on Train Data: {r2_train:.4f}")
    print(f"R-squared Score(R2 ) on Test Data: {r2_test:.4f}")

    # Evaluation
    model_evaluation_visualize(regressor, x_test, y_test)

def model_evaluation_visualize(regressor, x_test, y_test):
    y_pred = regressor.predict(x_test)
    # Plot predictions vs actual values
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, color='blue', edgecolors=(0, 0, 0))
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=3)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title('Actual vs Predicted Values')
    plt.show()

    
def cross_validation(regressor, x_train, y_train):
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(regressor, x_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    return scores

def kfold_cross_validation(classifier, x_train, y_train, cv, scoring= 'neg_mean_absolute_error'):
    # Use stratified k-fold for classification problems if the classifier supports probability prediction
    if hasattr(classifier, 'predict_proba'):
        kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    else:
        kfold = KFold(n_splits=cv, shuffle=True, random_state=42)

    scores = []
    for train_index, test_index in kfold.split(x_train, y_train):
        x_train_fold, x_test_fold = x_train[train_index], x_train[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

        classifier.fit(x_train_fold, y_train_fold)
        y_pred = classifier.predict(x_test_fold)

        score = scoring(y_test_fold, y_pred)  # Use the provided scoring function
        scores.append(score)

    return scores

Creating Model Using Diff Algorithms

In [18]:
lr = LinearRegression()
dtr = DecisionTreeRegressor(random_state=42)
ridge = Ridge(random_state=42)
gbr= GradientBoostingRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor()

⚡ 8. | Model Evaluation ⚡

8.1 | Random Forest Regressor

In [127]:
model_regressor(rf, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.1185
Root Mean Squared Error (RMSE): 0.3442
Mean Absolute Error (MAE): 0.1858
R-squared Score(R2 ) on Train Data: 0.9921
R-squared Score(R2 ) on Test Data: 0.9376

8.2 | Linear Regression

In [22]:
model_regressor(lr, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.4566
Root Mean Squared Error (RMSE): 0.6758
Mean Absolute Error (MAE): 0.4991
R-squared Score(R2 ) on Train Data: 0.7721
R-squared Score(R2 ) on Test Data: 0.7595

8.3 | Ridge Regression

In [23]:
model_regressor(ridge, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.4566
Root Mean Squared Error (RMSE): 0.6758
Mean Absolute Error (MAE): 0.4991
R-squared Score(R2 ) on Train Data: 0.7721
R-squared Score(R2 ) on Test Data: 0.7595

8.4 | Gradient Booster Regressor

In [24]:
model_regressor(gbr, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.3392
Root Mean Squared Error (RMSE): 0.5824
Mean Absolute Error (MAE): 0.4197
R-squared Score(R2 ) on Train Data: 0.8549
R-squared Score(R2 ) on Test Data: 0.8213

8.5 | Decision Tree Regressor

In [21]:
model_regressor(dtr, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.1389
Root Mean Squared Error (RMSE): 0.3727
Mean Absolute Error (MAE): 0.1230
R-squared Score(R2 ) on Train Data: 1.0000
R-squared Score(R2 ) on Test Data: 0.9262

8.6 |XBG Regressor

In [26]:
model_regressor(xgb, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.1654
Root Mean Squared Error (RMSE): 0.4067
Mean Absolute Error (MAE): 0.2579
R-squared Score(R2 ) on Train Data: 0.9747
R-squared Score(R2 ) on Test Data: 0.9129
In [139]:
model_regressor(xgb, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.1654
Root Mean Squared Error (RMSE): 0.4067
Mean Absolute Error (MAE): 0.2579
R-squared Score(R2 ) on Train Data: 0.9747
R-squared Score(R2 ) on Test Data: 0.9129

📊 Insights:

  • The performance evaluation revealed that Random Forest performed well than the Logistic Regression in terms of, r2_score which is 0.94

⚖️ 9. | Model Comparison ⚖️

Functions For Comparison ⚖️

In [16]:
def adjusted_r2_score(r2, n, k):
    return 1 - (1 - r2) * (n - 1) / (n - k - 1)

def compare_models_metrics(regressors, x_train, y_train, x_test, y_test):
    metrics = ['Train R2', 'Test R2', 'Train Adj R2', 'Test Adj R2', 'Train MSE', 'Test MSE', 'Train MAE', 'Test MAE', 'Cross Validation Mean Score', 'Cross Validation Std Score']
    metrics_values = []
    names = []

    for name, regressor in regressors:
        regressor.fit(x_train, y_train)
        prediction = regressor.predict(x_test)

        # Calculate metrics
        train_r2 = r2_score(y_train, regressor.predict(x_train))
        test_r2 = r2_score(y_test, prediction)
        n_train = len(y_train)
        n_test = len(y_test)
        k = x_train.shape[1]
        train_adj_r2 = adjusted_r2_score(train_r2, n_train, k)
        test_adj_r2 = adjusted_r2_score(test_r2, n_test, k)
        train_mse = mean_squared_error(y_train, regressor.predict(x_train))
        test_mse = mean_squared_error(y_test, prediction)
        train_mae = mean_absolute_error(y_train, regressor.predict(x_train))
        test_mae = mean_absolute_error(y_test, prediction)
        
        cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
        cv_scores = cross_val_score(regressor, x_train, y_train, cv=cv, scoring='r2')
        cross_val_score_mean = cv_scores.mean()
        cross_val_score_std = cv_scores.std()

        metrics_values.append([train_r2, test_r2, train_adj_r2, test_adj_r2, train_mse, test_mse, train_mae, test_mae, cross_val_score_mean, cross_val_score_std])
        names.append(name)

        # Print metrics
        print('\n')
        print(f"Model: {name}")
        print(f"Train R2: {train_r2:.2f}")
        print(f"Test R2: {test_r2:.2f}")
        print(f"Train Adj R2: {train_adj_r2:.2f}")
        print(f"Test Adj R2: {test_adj_r2:.2f}")
        print(f"Train MSE: {train_mse:.2f}")
        print(f"Test MSE: {test_mse:.2f}")
        print(f"Train MAE: {train_mae:.2f}")
        print(f"Test MAE: {test_mae:.2f}")
        print(f"Cross Validation Mean Score: {cross_val_score_mean:.2f}")
        print(f"Cross Validation Std Score: {cross_val_score_std:.2f}")
        print("-" * 50, end="\n")

    df_metrics = pd.DataFrame(metrics_values, columns=metrics)
    df_metrics['Model'] = names
    return df_metrics

def compare_models_metrics_heatmap(df_metrics):
    metrics = ['Train R2', 'Test R2', 'Train Adj R2', 'Test Adj R2', 'Train MSE', 'Test MSE', 'Train MAE', 'Test MAE', 'Cross Validation Mean Score', 'Cross Validation Std Score']

    # Plot heatmap
    plt.figure(figsize=(13, 11))
    sns.heatmap(df_metrics.set_index('Model')[metrics], annot=True, cmap='YlGnBu', fmt=".2f")
    plt.title('Comparison of Regressor Metrics')
    plt.xlabel('Metric')
    plt.ylabel('Regressor')
    plt.tight_layout()
    plt.show()

9.1 | ⚖️ Metrics Comparison

In [ ]:
# Define regression models
regressors = [
    ('Linear Regression', LinearRegression()),
    ('Decision Tree Regressor', DecisionTreeRegressor(random_state=42)),
    ('Ridge', Ridge(random_state=42)),
    ('Gradient Boosting Regressor', GradientBoostingRegressor(random_state=42)),
    ('Random Forest Regressor', RandomForestRegressor(random_state=42)),
    ('XGB Regressor', XGBRegressor(random_state=42))
]

# Example usage:
# Assuming x_train, y_train, x_test, y_test are defined and preprocessed
df_metrics = compare_models_metrics(regressors, x_train, y_train, x_test, y_test)

Model: Linear Regression
Train R2: 0.78
Test R2: 0.77
Train Adj R2: 0.78
Test Adj R2: 0.77
Train MSE: 0.40
Test MSE: 0.43
Train MAE: 0.48
Test MAE: 0.48
Cross Validation Mean Score: 0.78
Cross Validation Std Score: 0.02
--------------------------------------------------


Model: Decision Tree Regressor
Train R2: 1.00
Test R2: 0.93
Train Adj R2: 1.00
Test Adj R2: 0.93
Train MSE: 0.00
Test MSE: 0.14
Train MAE: 0.00
Test MAE: 0.12
Cross Validation Mean Score: 0.90
Cross Validation Std Score: 0.02
--------------------------------------------------


Model: Ridge
Train R2: 0.78
Test R2: 0.77
Train Adj R2: 0.78
Test Adj R2: 0.77
Train MSE: 0.40
Test MSE: 0.43
Train MAE: 0.48
Test MAE: 0.48
Cross Validation Mean Score: 0.78
Cross Validation Std Score: 0.02
--------------------------------------------------


Model: Gradient Boosting Regressor
Train R2: 0.85
Test R2: 0.81
Train Adj R2: 0.85
Test Adj R2: 0.81
Train MSE: 0.29
Test MSE: 0.35
Train MAE: 0.39
Test MAE: 0.43
Cross Validation Mean Score: 0.83
Cross Validation Std Score: 0.02
--------------------------------------------------
In [ ]:
compare_models_metrics_heatmap(df_metrics)

📊 Model Evaluation Insights:

We evaluated the performance of six different regression models for predicting movie ratings. Below are the evaluation metrics for each model: # Overall Conclusion 🤖 Here's the **Best Model** for each metric: - **Train R2**: `Decision Tree Regressor` - **Test R2**: `Random Forest Regressor` - **Train Adjusted R2**: `Decision Tree Regressor` - **Test Adjusted R2**: `Random Forest Regressor` - **Train MSE**: `Decision Tree Regressor` - **Test MSE**: `Random Forest Regressor` - **Train MAE**: `Decision Tree Regressor` - **Test MAE**: `Random Forest Regressor` - **Cross Validation Mean Score**: `Random Forest Regressor` - **Cross Validation Std Score**: `Random Forest Regressor`

9.2 | ⚖️ Residual Plots Comparison

In [104]:
# Define models
models = [
    ('LR', LinearRegression()),
    ('DTR', DecisionTreeRegressor(random_state=42)),
    ('Ridge', Ridge(random_state=42)),
    ('GBR', GradientBoostingRegressor(random_state=42)),
    ('RF', RandomForestRegressor(random_state=42)),
    ('XGB', XGBRegressor(random_state=42))
]

# Function to plot residuals
def plot_residuals(model, model_name):
    visualizer = ResidualsPlot(model, hist=True, figsize=(15, 6), colors=["black", "darkgoldenrod"])

    # Fit the visualizer with the training data
    visualizer.fit(x_train, y_train)

    # Evaluate the model on the test data
    visualizer.score(x_test, y_test)

    # Customizing the plot
    visualizer.ax.set_title(f"Residuals Plot for {model_name}")
    visualizer.ax.set_xlabel("Predicted Value")
    visualizer.ax.set_ylabel("Residuals")
    visualizer.ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Render the plot
    visualizer.show()

# Plot residuals for all models
for name, model in models:
    print(f"Plotting residuals for {name}...")
    plot_residuals(model, name)
Plotting residuals for LR...
Plotting residuals for DTR...
Plotting residuals for Ridge...
Plotting residuals for GBR...
Plotting residuals for RF...
Plotting residuals for XGB...

🔮 10. | Making Prediction 🔮

10.1 | Making Prediction By Manually Giving Data

In [151]:
# Suppress warnings
import warnings
warnings.filterwarnings("ignore", message="X has feature names")


# Define your data
data = {
    'Year': [2019], 
    'Votes': [141], 
    'Duration': [3581], 
    'Genre_mean_rating': [5.6], 
    'Director_encoded': [6.83], 
    'Actor1_encoded': [6.2], 
    'Actor2_encoded': [5.1], 
    'Actor3_encoded': [6.4]
}

# Create DataFrame
df = pd.DataFrame(data)

# Define models
models = {
    'Linear Regression': lr,
    'Decision Tree Regressor': dtr,
    'Ridge': ridge,
    'Gradient Boosting Regressor': gbr,
    'Random Forest Regressor': rf,
    'XGBoost Regressor': xgb
}

# Make predictions using each model
for name, model in models.items():
    print(f"Predictions using {name}:")
    predicted_rating = model.predict(df)
    print("Predicted Rating:", predicted_rating[0])
    print()
    
    
# Define a dictionary to store predicted ratings for each model
predicted_ratings = {}

# Predict ratings for each model
for name, model in models.items():
    predicted_rating = model.predict(df)
    predicted_ratings[name] = predicted_rating[0]

# Find the model with the closest predicted rating to the original rating (6.6)
original_rating = 6.6
best_model = min(predicted_ratings, key=lambda x: abs(predicted_ratings[x] - original_rating))

print(f"The best model is {best_model} with predicted rating of {predicted_ratings[best_model]}")
Predictions using Linear Regression:
Predicted Rating: 6.3596194204496665

Predictions using Decision Tree Regressor:
Predicted Rating: 6.6

Predictions using Ridge:
Predicted Rating: 6.359568775313514

Predictions using Gradient Boosting Regressor:
Predicted Rating: 6.1947144654586825

Predictions using Random Forest Regressor:
Predicted Rating: 6.360000000000002

Predictions using XGBoost Regressor:
Predicted Rating: 5.972795

The best model is Decision Tree Regressor with predicted rating of 6.6

10.2 | Making Prediction By Taking Input Using Best Model(Only)

In [150]:
def predict_movie_rating(model, input_data):
    print("Input data:", input_data)
    # Convert input data to a list
    input_data_as_list = list(input_data.values())
    print("\nInput data as list:", input_data_as_list)
    # Reshape the list as we are predicting for only one instance
    input_data_reshaped = np.array(input_data_as_list).reshape(1, -1)
    print("\nInput data reshaped:", input_data_reshaped)
    # Make prediction using the model
    prediction = model.predict(input_data_reshaped)
    print("\n\n_________________________________\nPrediction:", prediction)
    # Return the prediction
    return prediction[0]

# Function to take input from the user
def get_user_input():
    year = int(input("Enter the year: "))
    votes = int(input("Enter the number of votes: "))
    duration = int(input("Enter the duration (in minutes): "))
    genre_mean_rating = float(input("Enter the genre mean rating: "))
    director_encoded = float(input("Enter the director encoded value: "))
    actor1_encoded = float(input("Enter the actor1 encoded value: "))
    actor2_encoded = float(input("Enter the actor2 encoded value: "))
    actor3_encoded = float(input("Enter the actor3 encoded value: "))

    # Convert all input data to a dictionary
    input_data = {
        'Year': year,
        'Votes': votes,
        'Duration': duration,
        'Genre_mean_rating': genre_mean_rating,
        'Director_encoded': director_encoded,
        'Actor1_encoded': actor1_encoded,
        'Actor2_encoded': actor2_encoded,
        'Actor3_encoded': actor3_encoded
    }
    return input_data

# Load your RandomForestRegressor model before making predictions
# rf = joblib.load('your_model_path.pkl')

# Example usage
input_data = get_user_input()
result = predict_movie_rating(dtr, input_data)
#data = {'Year': [2019], 'Votes': [141], 'Duration': [3581], 'Genre_mean_rating': [5.633333333], 'Director_encoded': [6.833333333], 'Actor1_encoded': [6.225806452], 'Actor2_encoded': [5.057462687], 'Actor3_encoded': [6.44]}

# Print results
print("\n\nIndividual Input Data:", input_data, "\n\nPrediction:", result)
Enter the year: 2010
Enter the number of votes: 141
Enter the duration (in minutes): 3581
Enter the genre mean rating: 5.6
Enter the director encoded value: 6.83
Enter the actor1 encoded value: 6.2
Enter the actor2 encoded value: 5.1
Enter the actor3 encoded value: 6.4
Input data: {'Year': 2010, 'Votes': 141, 'Duration': 3581, 'Genre_mean_rating': 5.6, 'Director_encoded': 6.83, 'Actor1_encoded': 6.2, 'Actor2_encoded': 5.1, 'Actor3_encoded': 6.4}

Input data as list: [2010, 141, 3581, 5.6, 6.83, 6.2, 5.1, 6.4]

Input data reshaped: [[2010.    141.   3581.      5.6     6.83    6.2     5.1     6.4 ]]


_________________________________
Prediction: [6.6]


Individual Input Data: {'Year': 2010, 'Votes': 141, 'Duration': 3581, 'Genre_mean_rating': 5.6, 'Director_encoded': 6.83, 'Actor1_encoded': 6.2, 'Actor2_encoded': 5.1, 'Actor3_encoded': 6.4} 

Prediction: 6.6
# 📊 Insights: ## Model Evaluation Insights ### Predictions from Different Models We utilized six different regression models to predict the rating of a movie based on various features. Here are the predicted ratings from each model: - **Linear Regression**: Predicted rating is X. - **Decision Tree Regressor**: Predicted rating is Y. - **Ridge Regression**: Predicted rating is Z. - **Gradient Boosting Regressor**: Predicted rating is A. - **Random Forest Regressor**: Predicted rating is B. - **XGBoost Regressor**: Predicted rating is C. ### Selection of the Best Model After evaluating the predictions made by each model, we aimed to determine the best model that provides the most accurate prediction for the movie rating. 1. The original rating of the movie was **6.6**. 2. Based on the predictions made by all models, the model with the closest predicted rating to the original rating was the *Decision Tree Regressor*. This model predicted a rating of **6.6**, which is the closest to the original rating of 6.6 among all models. Therefore, we have selected the **Decision Tree Regressor** as the best model for predicting movie ratings.

💾 11. | Saving Model for Deployment 💾

11.1 | DTR Regressor

In [22]:
## Saving Model
import joblib
filename = "DTR_Regressor.sav"
joblib.dump(dtr, open(filename, 'wb'))

loaded_model_DTR = joblib.load("DTR_Regressor.sav")
result = loaded_model_DTR.score(x_test, y_test)
print(result)
0.9262468361216416

11.2 | XGB Regressor

In [26]:
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train, y_train)
rf.score(x_test, y_test)
Out[26]:
0.9383800492163734
In [27]:
## Saving Model
import joblib
filename = "RFR_Regressor.sav"
joblib.dump(rf, open(filename, 'wb'))

loaded_model_RFR = joblib.load("RFR_Regressor.sav")
result = loaded_model_RFR.score(x_test, y_test)
print(result)
0.9383800492163734
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: